In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, linear_model

%matplotlib inline

In [2]:
# Load the house-sales table from a CSV file resolved relative to the working directory.
sales = pd.read_csv('home_data.csv')

In [3]:
# Quick 80% random sample of the rows, seeded so a re-run draws the same rows.
# NOTE: `train_data` is reassigned by the train/test split in the following
# cell, so this sample is not what the model is eventually trained on.
train_data = sales.sample(frac=0.8, random_state=42)

In [4]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`.
try:
    from sklearn.model_selection import train_test_split
except ImportError:  # very old scikit-learn without model_selection
    from sklearn.cross_validation import train_test_split

# train_test_split returns (first_part, second_part), where `test_size` is the
# fraction given to the SECOND part. With test_size=0.8 the second part holds
# ~80% of the rows and is used here as the training set; the first ~20% is kept
# as the held-out test set. The unpacking order below is therefore intentional.
test_data, train_data = train_test_split(sales, test_size=0.8, random_state=42)

Basic commands for data analysis


In [5]:
# Number of rows in the held-out test split.
test_data.shape[0]


Out[5]:
4322

In [6]:
# Number of rows in the training split.
train_data.shape[0]


Out[6]:
17291

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
sales.describe()


Out[7]:
id price bedrooms bathrooms sqft_living sqft_lot floors waterfront view condition grade sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
count 2.161300e+04 2.161300e+04 21613.000000 21613.000000 21613.000000 2.161300e+04 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000 21613.000000
mean 4.580302e+09 5.400881e+05 3.370842 2.114757 2079.899736 1.510697e+04 1.494309 0.007542 0.234303 3.409430 7.656873 1788.390691 291.509045 1971.005136 84.402258 98077.939805 47.560053 -122.213896 1986.552492 12768.455652
std 2.876566e+09 3.671272e+05 0.930062 0.770163 918.440897 4.142051e+04 0.539989 0.086517 0.766318 0.650743 1.175459 828.090978 442.575043 29.373411 401.679240 53.505026 0.138564 0.140828 685.391304 27304.179631
min 1.000102e+06 7.500000e+04 0.000000 0.000000 290.000000 5.200000e+02 1.000000 0.000000 0.000000 1.000000 1.000000 290.000000 0.000000 1900.000000 0.000000 98001.000000 47.155900 -122.519000 399.000000 651.000000
25% 2.123049e+09 3.219500e+05 3.000000 1.750000 1427.000000 5.040000e+03 1.000000 0.000000 0.000000 3.000000 7.000000 1190.000000 0.000000 1951.000000 0.000000 98033.000000 47.471000 -122.328000 1490.000000 5100.000000
50% 3.904930e+09 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 0.000000 0.000000 3.000000 7.000000 1560.000000 0.000000 1975.000000 0.000000 98065.000000 47.571800 -122.230000 1840.000000 7620.000000
75% 7.308900e+09 6.450000e+05 4.000000 2.500000 2550.000000 1.068800e+04 2.000000 0.000000 0.000000 4.000000 8.000000 2210.000000 560.000000 1997.000000 0.000000 98118.000000 47.678000 -122.125000 2360.000000 10083.000000
max 9.900000e+09 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 1.000000 4.000000 5.000000 13.000000 9410.000000 4820.000000 2015.000000 2015.000000 98199.000000 47.777600 -121.315000 6210.000000 871200.000000

In [8]:
# Distinct bedroom counts present in the data (useful for spotting outliers).
sales.bedrooms.unique()


Out[8]:
array([ 3,  2,  4,  5,  1,  6,  7,  0,  8,  9, 11, 10, 33], dtype=int64)

In [9]:
# Non-null entries per column; equal counts across columns mean no missing values.
sales.count()


Out[9]:
id               21613
date             21613
price            21613
bedrooms         21613
bathrooms        21613
sqft_living      21613
sqft_lot         21613
floors           21613
waterfront       21613
view             21613
condition        21613
grade            21613
sqft_above       21613
sqft_basement    21613
yr_built         21613
yr_renovated     21613
zipcode          21613
lat              21613
long             21613
sqft_living15    21613
sqft_lot15       21613
dtype: int64

In [10]:
# Boolean flag: True where a renovation year is recorded (yr_renovated > 0).
# NOTE: train_data / test_data were created before this cell, so they do not
# receive this column.
sales['renovated'] = sales.yr_renovated > 0

In [11]:
# Peek at the first five rows, including the newly added `renovated` column.
sales.head()


Out[11]:
id date price bedrooms bathrooms sqft_living sqft_lot floors waterfront view ... sqft_above sqft_basement yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15 renovated
0 7129300520 20141013T000000 221900 3 1.00 1180 5650 1.0 0 0 ... 1180 0 1955 0 98178 47.5112 -122.257 1340 5650 False
1 6414100192 20141209T000000 538000 3 2.25 2570 7242 2.0 0 0 ... 2170 400 1951 1991 98125 47.7210 -122.319 1690 7639 True
2 5631500400 20150225T000000 180000 2 1.00 770 10000 1.0 0 0 ... 770 0 1933 0 98028 47.7379 -122.233 2720 8062 False
3 2487200875 20141209T000000 604000 4 3.00 1960 5000 1.0 0 0 ... 1050 910 1965 0 98136 47.5208 -122.393 1360 5000 False
4 1954400510 20150218T000000 510000 3 2.00 1680 8080 1.0 0 0 ... 1680 0 1987 0 98074 47.6168 -122.045 1800 7503 False

5 rows × 22 columns


In [12]:
# Scatter plot of price against living area for the full data set.
sales.plot.scatter(x='sqft_living', y='price')


Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1736507c358>

In [13]:
# Row labels of the frame (a default integer range in the recorded output).
sales.index


Out[13]:
RangeIndex(start=0, stop=21613, step=1)

In [14]:
# Column labels of the frame.
sales.columns


Out[14]:
Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15', 'renovated'],
      dtype='object')

In [15]:
# Underlying data as a NumPy array; mixed column types give dtype=object.
sales.values


Out[15]:
array([[7129300520, '20141013T000000', 221900, ..., 1340, 5650, False],
       [6414100192, '20141209T000000', 538000, ..., 1690, 7639, True],
       [5631500400, '20150225T000000', 180000, ..., 2720, 8062, False],
       ..., 
       [1523300141, '20140623T000000', 402101, ..., 1020, 2007, False],
       [291310100, '20150116T000000', 400000, ..., 1410, 1287, False],
       [1523300157, '20141015T000000', 325000, ..., 1020, 1357, False]], dtype=object)

In [16]:
# Row labels again (same inspection as the earlier `sales.index` cell).
sales.index


Out[16]:
RangeIndex(start=0, stop=21613, step=1)

In [19]:
# Count how many homes have each number of bedrooms.
bed_count = sales['bedrooms'].value_counts()

In [20]:
# value_counts() yields a pandas Series (bedroom number -> frequency).
type(bed_count)


Out[20]:
pandas.core.series.Series

In [21]:
# Frequencies listed from most to least common bedroom count.
bed_count


Out[21]:
3     9824
4     6882
2     2760
5     1601
6      272
1      199
7       38
8       13
0       13
9        6
10       3
11       1
33       1
Name: bedrooms, dtype: int64

In [22]:
# Order by bedroom number so the x axis increases, then plot the frequencies.
bed_count.sort_index().plot()


Out[22]:
<matplotlib.axes._subplots.AxesSubplot at 0x17366b2ebe0>

In [24]:
# Show the row labelled 10. `.ix` is deprecated and was removed in pandas 1.0;
# `.loc` is the explicit label-based accessor and selects the same row here
# because the frame has an integer index.
sales.loc[10]


Out[24]:
id                    1736800520
date             20150403T000000
price                     662500
bedrooms                       3
bathrooms                    2.5
sqft_living                 3560
sqft_lot                    9796
floors                         1
waterfront                     0
view                           0
condition                      3
grade                          8
sqft_above                  1860
sqft_basement               1700
yr_built                    1965
yr_renovated                   0
zipcode                    98007
lat                      47.6007
long                    -122.145
sqft_living15               2210
sqft_lot15                  8925
renovated                  False
Name: 10, dtype: object

In [54]:
# Linear regression model that will be fitted on a single feature (sqft_living).
regr_one_feature = linear_model.LinearRegression()

In [55]:
# First attempt: living area as a flat 1-D array (replaced by a 2-D column
# array a few cells below).
training_data_features = train_data.sqft_living.values

In [56]:
# Check the shape: 1-D here, i.e. there is no separate feature axis yet.
training_data_features.shape


Out[56]:
(17291,)

In [70]:
# Feature matrix with one column (sqft_living) and one row per training sample;
# selecting with a list of column names keeps the result two-dimensional.
training_data_features = train_data[['sqft_living']].values

In [71]:
# Confirm the features are now 2-D: (number of samples, 1 feature).
training_data_features.shape


Out[71]:
(17291, 1)

In [72]:
# First attempt: prices as a flat 1-D array (replaced by a 2-D column array a
# few cells below).
training_data_targets = train_data.price.values

In [73]:
# Check the shape: 1-D, one price per training row.
training_data_targets.shape


Out[73]:
(17291,)

In [74]:
# Target prices as a single-column 2-D array, matching the layout of the features.
training_data_targets = train_data[['price']].values

In [75]:
# Confirm the targets are now 2-D: (number of samples, 1).
training_data_targets.shape


Out[75]:
(17291, 1)

In [76]:
# Fit price as a linear function of sqft_living on the training split.
regr_one_feature.fit(training_data_features, training_data_targets)


Out[76]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [78]:
# Fitted slope: estimated change in price per additional square foot of living area.
regr_one_feature.coef_


Out[78]:
array([[ 285.46625209]])

In [79]:
# Build the evaluation arrays from the held-out split, using the same
# single-column 2-D layout as the training arrays.
test_data_features = test_data[['sqft_living']].values
test_data_targets = test_data[['price']].values

In [80]:
# score() returns the coefficient of determination R^2 on the held-out data
# (1.0 is a perfect prediction); it is not a variance.
regr_one_feature.score(test_data_features, test_data_targets)


Out[80]:
0.48194026314776461

In [93]:
# NOTE(review): `math` is imported here but not used by this cell.
import math
# Mean squared error computed by hand: average of the squared differences
# between predicted and actual prices on the held-out split.
np.mean((regr_one_feature.predict(test_data_features) - test_data_targets) ** 2)


Out[93]:
60335964329.627296

In [90]:
# Held-out observations in blue, with the model's predictions drawn over them
# as a red line.
plt.scatter(test_data_features, test_data_targets, color='blue')
plt.plot(test_data_features, regr_one_feature.predict(test_data_features), color='red', linewidth=3)


Out[90]:
[<matplotlib.lines.Line2D at 0x17367e9a518>]

Metrics


In [91]:
from sklearn.metrics import mean_squared_error

# Library version of the hand-written mean squared error computed before the
# plot; both should report the same number.
mean_squared_error(
    test_data_targets,
    regr_one_feature.predict(test_data_features),
)


Out[91]:
60335964329.627296

In [94]:
# Slope of the fitted line (same value as inspected right after fitting).
regr_one_feature.coef_


Out[94]:
array([[ 285.46625209]])

In [96]:
# Intercept of the fitted line: the predicted price at sqft_living = 0.
regr_one_feature.intercept_


Out[96]:
array([-53461.50313129])

In [ ]: